August 2019
Dataset: 10k PBMCs from a Healthy Donor available from 10x Genomics (here).
This notebook uses a loom file generated from the first part of the SCENIC protocol, described in: PBMC10k_SCENIC-protocol-CLI.ipynb
# import dependencies
import os
import numpy as np
import pandas as pd
import scanpy as sc
import loompy as lp
from MulticoreTSNE import MulticoreTSNE as TSNE
import json
import base64
import zlib
from pyscenic.plotting import plot_binarization
from pyscenic.export import add_scenic_metadata
from pyscenic.cli.utils import load_signatures
import matplotlib as mpl
import matplotlib.pyplot as plt
from scanpy.plotting._tools.scatterplots import plot_scatter
import seaborn as sns
# set a working directory
wdir = '/ddn1/vol1/staging/leuven/stg_00002/lcb/cflerin/testruns/scenic-protocol/releasetest2/'
os.chdir( wdir )
# path to loom output, generated from a combination of Scanpy and pySCENIC results:
f_final_loom = 'PBMC10k.loom'
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_versions()
sc.settings.set_figure_params(dpi=150)
# scenic output
lf = lp.connect( f_final_loom, mode='r', validate=False )
meta = json.loads(zlib.decompress(base64.b64decode( lf.attrs.MetaData )))
exprMat = pd.DataFrame( lf[:,:], index=lf.ra.Gene, columns=lf.ca.CellID).T
auc_mtx = pd.DataFrame( lf.ca.RegulonsAUC, index=lf.ca.CellID)
# create a dictionary of regulons:
regulons = {}
for i,r in pd.DataFrame(lf.ra.Regulons,index=lf.ra.Gene).iteritems():
regulons[i] = list(r[r==1].index.values)
# cell annotations from the loom column attributes:
cellAnnot = pd.concat(
[
pd.DataFrame( lf.ca.Celltype_Garnett, index=lf.ca.CellID ),
pd.DataFrame( lf.ca.ClusterID, index=lf.ca.CellID ),
pd.DataFrame( lf.ca.Louvain_clusters_Scanpy, index=lf.ca.CellID ),
pd.DataFrame( lf.ca.Percent_mito, index=lf.ca.CellID ),
pd.DataFrame( lf.ca.nGene, index=lf.ca.CellID ),
pd.DataFrame( lf.ca.nUMI, index=lf.ca.CellID ),
],
axis=1
)
cellAnnot.columns = [
'Celltype_Garnett',
'ClusterID',
'Louvain_clusters_Scanpy',
'Percent_mito',
'nGene',
'nUMI']
# capture embeddings:
dr = [
pd.DataFrame( lf.ca.Embedding, index=lf.ca.CellID )
]
dr_names = [
meta['embeddings'][0]['name'].replace(" ","_")
]
# add other embeddings
drx = pd.DataFrame( lf.ca.Embeddings_X, index=lf.ca.CellID )
dry = pd.DataFrame( lf.ca.Embeddings_Y, index=lf.ca.CellID )
for i in range( len(drx.columns) ):
dr.append( pd.concat( [ drx.iloc[:,i], dry.iloc[:,i] ], sort=False, axis=1, join='outer' ))
dr_names.append( meta['embeddings'][i+1]['name'].replace(" ","_").replace('/','-') )
# rename columns:
for i,x in enumerate( dr ):
x.columns = ['X','Y']
lf.close()
scanpy.AnnData object¶This can be done directly from the integrated loom file, with a few modifications to allow for SCENIC- and SCope-specific loom attributes:
adata = sc.read( f_final_loom, validate=False)
# drop the embeddings and extra attributes from the obs object
adata.obs.drop( ['Embedding','Embeddings_X','Embeddings_Y','RegulonsAUC'], axis=1, inplace=True )
# add the embeddings into the adata.obsm object
for i,x in enumerate( dr ):
adata.obsm[ 'X_'+dr_names[i] ] = x.as_matrix()
sc.utils.sanitize_anndata( adata )
scanpy.AnnData object.¶# load the regulons from a file using the load_signatures function
sig = load_signatures('reg.csv')
adata = add_scenic_metadata(adata, auc_mtx, sig)
# helper functions (not yet integrated into pySCENIC):
from pyscenic.utils import load_motifs
import operator as op
from IPython.display import HTML, display
BASE_URL = "http://motifcollections.aertslab.org/v9/logos/"
COLUMN_NAME_LOGO = "MotifLogo"
COLUMN_NAME_MOTIF_ID = "MotifID"
COLUMN_NAME_TARGETS = "TargetGenes"
def display_logos(df: pd.DataFrame, top_target_genes: int = 3, base_url: str = BASE_URL):
"""
:param df:
:param base_url:
"""
# Make sure the original dataframe is not altered.
df = df.copy()
# Add column with URLs to sequence logo.
def create_url(motif_id):
return '<img src="{}{}.png" style="max-height:124px;"></img>'.format(base_url, motif_id)
df[("Enrichment", COLUMN_NAME_LOGO)] = list(map(create_url, df.index.get_level_values(COLUMN_NAME_MOTIF_ID)))
# Truncate TargetGenes.
def truncate(col_val):
return sorted(col_val, key=op.itemgetter(1))[:top_target_genes]
df[("Enrichment", COLUMN_NAME_TARGETS)] = list(map(truncate, df[("Enrichment", COLUMN_NAME_TARGETS)]))
MAX_COL_WIDTH = pd.get_option('display.max_colwidth')
pd.set_option('display.max_colwidth', 200)
display(HTML(df.head().to_html(escape=False)))
pd.set_option('display.max_colwidth', MAX_COL_WIDTH)
df_motifs = load_motifs('reg.csv')
selected_motifs = ['PAX5','TCF3','EBF1']
df_motifs_sel = df_motifs.iloc[ [ True if x in selected_motifs else False for x in df_motifs.index.get_level_values('TF') ] ,:]
#display_logos(df_motifs.head())
display_logos( df_motifs_sel.sort_values([('Enrichment','NES')], ascending=False).head(9))
sc.set_figure_params(frameon=False, dpi=600, fontsize=10, dpi_save=600)
sc.pl.scatter( adata, basis='highly_variable_genes_UMAP',
color=['Louvain_clusters_Scanpy','Celltype_Garnett'],
title=['HVG - UMAP (Louvain clusters)','HVG - UMAP (Cell type)'],
alpha=0.8,
save='_Louvain-celltype.pdf'
)
sc.pl.scatter( adata, basis='SCENIC_AUC_UMAP',
color=['Louvain_clusters_Scanpy','Celltype_Garnett'],
title=['SCENIC - UMAP (Louvain clusters)','SCENIC - UMAP (Cell type)'],
alpha=0.8,
save='_Louvain-celltype.pdf'
)